Variables:
Risk Age Sex Country
library(data.table)
library(tidyr)
# ---- Wave 5: load, rename, subset, and attach country names ----
# Read the Wave 5 release from its .rds file.
WV5_data <- readRDS("/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/F00007944-WV5_Data_R_v20180912.rds")
# Work on a plain data.frame copy and preview its first five columns.
WV5_data_df <- as.data.frame(WV5_data)
head(WV5_data_df[, seq_len(5)])
library(dplyr)
# Replace the V-coded column names with descriptive ones.
WV5_data <- rename(
  WV5_data_df,
  sex = V235, age = V237, country = V2, wave = V1, risk = V86
)
WV5_data
# Keep only the five analysis variables.
WV5_data <- select(WV5_data, sex, age, country, wave, risk)
WV5_data
# Translate the numeric country codes into country names using the lookup
# file (read without a header; its two columns are named code and name here).
countrynames <- read.csv("/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/countrynames.txt", header = FALSE, as.is = TRUE)
names(countrynames) <- c("code", "name")
WV5_data$country_lab <- countrynames$name[
  match(WV5_data$country, countrynames$code)
]
table(WV5_data$country_lab)
Andorra Argentina Australia
1003 1002 1421
Brazil Bulgaria Burkina Faso
1500 1001 1534
Canada Chile China
2164 1000 1991
Colombia Cyprus (G) Egypt
3025 1050 3051
Ethiopia Finland France
1500 1014 1001
Georgia Germany Ghana
1500 2064 1534
Great Britain Guatemala Hong Kong
1041 1000 1252
Hungary India Indonesia
1007 2001 2015
Iran Iraq Italy
2667 2701 1012
Japan Jordan Malaysia
1096 1200 1201
Mali Mexico Moldova
1534 1560 1046
Morocco Netherlands New Zealand
1200 1050 954
Norway Peru Poland
1025 1500 1000
Romania Russia Rwanda
1776 2033 1507
Slovenia South Africa South Korea
1037 2988 1200
Spain Sweden Switzerland
1200 1003 1241
Taiwan Thailand Trinidad and Tobago
1227 1534 1002
Turkey Ukraine United States
1346 1000 1249
Uruguay Viet Nam Zambia
1000 1495 1500
WV5_data
NA
NA
# ---- Wave 6: load, rename, subset, and attach country names ----
# load() restores the saved object into the workspace under its stored name
# and returns only a character vector of object names, so its return value is
# deliberately not assigned to WV6_data.
load("/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/WV6_Data_R_v20201117.rdata")
WV6_data <- WV6_Data_R_v20201117
print(WV6_data)
# Replace the V-coded column names with descriptive ones.
WV6_data <- WV6_data %>%
  rename(wave = V1, sex = V240, age = V242, country = V2, risk = V76)
# Keep only the variables of interest (each column listed once).
WV6_data <- WV6_data %>%
  select(wave, sex, age, country, risk)
WV6_data
# Decode the country codes of the Wave 6 data set into country names using
# the lookup file (read without a header; columns named code and name here).
countrynames <- read.csv("/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/countrynames.txt", header = FALSE, as.is = TRUE)
colnames(countrynames) <- c("code", "name")
WV6_data$country_lab <- countrynames$name[match(WV6_data$country, countrynames$code)]
table(WV6_data$country_lab)
Algeria Argentina Armenia
1200 1030 1100
Australia Azerbaijan Belarus
1477 1002 1535
Brazil Chile China
1486 1000 2300
Colombia Cyprus (G) Ecuador
1512 1000 1202
Egypt Estonia Georgia
1523 1533 1202
Germany Ghana Haiti
2046 1552 1996
Hong Kong India Iraq
1000 4078 1200
Japan Jordan Kazakhstan
2443 1200 1500
Kuwait Kyrgyzstan Lebanon
1303 1500 1200
Libya Malaysia Mexico
2131 1300 2000
Morocco Netherlands New Zealand
1200 1902 841
Nigeria Pakistan Palestine
1759 1200 1000
Peru Philippines Poland
1210 1200 966
Qatar Romania Russia
1060 1503 2500
Rwanda Singapore Slovenia
1527 1972 1069
South Africa South Korea Spain
3531 1200 1189
Sweden Taiwan Thailand
1206 1238 1200
Trinidad and Tobago Tunisia Turkey
999 1205 1605
Ukraine United States Uruguay
1500 2232 1000
Uzbekistan Yemen Zimbabwe
1500 1000 1500
WV6_data
# Stack the two waves (Wave 5 rows first, then Wave 6) into one data set.
WV5_data
WV6_data
data <- rbind(WV5_data, WV6_data)
data
# How many distinct country names does the pooled data set contain?
length(unique(data[["country_lab"]]))
[1] 80
# Exclusion of participants and omission of missing data:
# keep only rows whose risk, sex and age are all strictly positive, then drop
# every row that still contains an NA in any column.
keep_valid_rows <- function(df) {
  na.omit(subset(df, risk > 0 & sex > 0 & age > 0))
}
data <- keep_valid_rows(data)
data_Wave5 <- keep_valid_rows(WV5_data)
data_Wave6 <- keep_valid_rows(WV6_data)
# Number of participants (pooled data set; per-wave counts follow below)
nrow(data)
[1] 156528
nrow(data_Wave5)
[1] 70308
nrow(data_Wave6)
[1] 86220
# dplyr supplies count()
library(dplyr)
# Tally the rows of 'data' for each country name (column 'country_lab')
country_counts <- count(data, country_lab)
# Show the per-country tallies
print(country_counts)
NA
#number and list of participants per country for each wave (and for both waves together)
table(data_Wave5$country_lab)
Andorra Argentina Australia
1001 981 1381
Brazil Bulgaria Burkina Faso
1492 942 1332
Canada Chile China
2121 969 1898
Cyprus (G) Egypt Ethiopia
1042 3026 1481
Finland France Georgia
1013 995 1451
Germany Ghana Great Britain
2019 1513 1036
Hungary India Indonesia
1003 1575 1942
Iran Japan Jordan
2615 1032 1163
Malaysia Mali Mexico
1200 1312 1505
Moldova Morocco Netherlands
1028 1145 1046
Norway Peru Poland
1019 1430 989
Romania Russia Rwanda
1583 1970 1409
Slovenia South Africa South Korea
1008 2945 1200
Spain Sweden Switzerland
1184 997 1233
Taiwan Thailand Trinidad and Tobago
1225 1514 997
Turkey Ukraine United States
1303 967 1219
Uruguay Viet Nam Zambia
989 1416 1452
length(unique(data_Wave5$country_lab))
[1] 51
table(data_Wave6$country_lab)
Algeria Argentina Armenia
1115 1011 1090
Australia Azerbaijan Belarus
1441 1002 1528
Brazil Chile China
1481 914 2167
Colombia Cyprus (G) Ecuador
1506 993 1201
Egypt Estonia Georgia
1523 1509 1190
Germany Ghana Haiti
2024 1552 1976
Hong Kong India Iraq
977 3472 1187
Japan Jordan Kazakhstan
2201 1195 1500
Kuwait Kyrgyzstan Lebanon
1190 1497 1177
Libya Malaysia Mexico
2043 1300 1996
Morocco Netherlands New Zealand
1035 1813 802
Nigeria Pakistan Palestine
1759 1176 974
Peru Philippines Poland
1158 1199 950
Qatar Romania Russia
1052 1436 1806
Rwanda Singapore Slovenia
1527 1938 1051
South Africa South Korea Spain
3481 1182 1173
Sweden Taiwan Thailand
1200 1195 1160
Trinidad and Tobago Tunisia Turkey
983 1097 1573
Ukraine United States Uruguay
1500 2189 991
Uzbekistan Yemen Zimbabwe
1433 929 1500
length(unique(data_Wave6$country_lab))
[1] 60
length(unique(data$country_lab))
[1] 77
table(data$country_lab)
Algeria Andorra Argentina
1115 1001 1992
Armenia Australia Azerbaijan
1090 2822 1002
Belarus Brazil Bulgaria
1528 2973 942
Burkina Faso Canada Chile
1332 2121 1883
China Colombia Cyprus (G)
4065 1506 2035
Ecuador Egypt Estonia
1201 4549 1509
Ethiopia Finland France
1481 1013 995
Georgia Germany Ghana
2641 4043 3065
Great Britain Haiti Hong Kong
1036 1976 977
Hungary India Indonesia
1003 5047 1942
Iran Iraq Japan
2615 1187 3233
Jordan Kazakhstan Kuwait
2358 1500 1190
Kyrgyzstan Lebanon Libya
1497 1177 2043
Malaysia Mali Mexico
2500 1312 3501
Moldova Morocco Netherlands
1028 2180 2859
New Zealand Nigeria Norway
802 1759 1019
Pakistan Palestine Peru
1176 974 2588
Philippines Poland Qatar
1199 1939 1052
Romania Russia Rwanda
3019 3776 2936
Singapore Slovenia South Africa
1938 2059 6426
South Korea Spain Sweden
2382 2357 2197
Switzerland Taiwan Thailand
1233 2420 2674
Trinidad and Tobago Tunisia Turkey
1980 1097 2876
Ukraine United States Uruguay
2467 3408 1980
Uzbekistan Viet Nam Yemen
1433 1416 929
Zambia Zimbabwe
1452 1500
# Reverse-code the risk item (x -> 7 - x), so a response of 1 becomes 6 and a
# response of 6 becomes 1.
data$risk <- 7 - data$risk
# Keep a copy of the reverse-coded item on its original response scale.
data$risk_ord <- data$risk
# Standardise to mean 50 and standard deviation 10. scale() returns a
# one-column matrix; as.numeric() turns it back into a plain vector so that
# 'risk' is an ordinary numeric column (otherwise summary() reports it as
# "risk.V1" and the column carries matrix attributes).
data$risk <- 10 * as.numeric(scale(data$risk, center = TRUE, scale = TRUE)) + 50
data
NA
NA
#number of males vs females (1 = males; 2 = females)
table(data$sex)
female male
81384 75144
table(data_Wave5$sex)
female male
36520 33788
table(data_Wave6$sex)
female male
44864 41356
# Categorical age variable: one band for ages below 20, ten-year bands from
# 20 up to 79, and an open-ended band from 80 upwards. Intervals are closed on
# the left (e.g. 20 <= age < 30), and the result is stored as character.
data$agecat <- as.character(cut(
  data$age,
  breaks = c(-Inf, 20, 30, 40, 50, 60, 70, 80, Inf),
  labels = c("15-19", "20-29", "30-39", "40-49",
             "50-59", "60-69", "70-79", "80+"),
  right = FALSE
))
# Gender labels: code 1 becomes "male", code 2 becomes "female"; any other
# value is left as it is.
label_sex <- function(codes) {
  codes[codes == 1] <- "male"
  codes[codes == 2] <- "female"
  codes
}
data$sex <- label_sex(data$sex)
data_Wave5$sex <- label_sex(data_Wave5$sex)
data_Wave6$sex <- label_sex(data_Wave6$sex)
#average age of participants
mean(data$age)
[1] 41.62343
median(data$age)
[1] 39
# Wave labels: code 5 becomes "Wave 5", code 6 becomes "Wave 6"
data$wave <- replace(data$wave, data$wave == 5, "Wave 5")
data$wave <- replace(data$wave, data$wave == 6, "Wave 6")
data
#age range
range(data$age)
[1] 15 102
range(data_Wave5$age)
[1] 15 98
range(data_Wave6$age)
[1] 16 102
# Frequency histogram of the rescaled risk-taking score
library(ggplot2)
ggplot(data = data, mapping = aes(x = risk)) +
  geom_histogram(
    binwidth = 0.5,
    fill = "lightblue",
    colour = "black"
  ) +
  labs(
    x = "Risk Taking",
    y = "Frequency",
    title = "Histogram of Risk Taking"
  ) +
  theme_minimal()
# Frequency histogram of participant age.
# One bin per year of age: with a bin width of 0.5 every second bin stays
# empty when ages are whole years, which makes the histogram look striped.
# NOTE(review): assumes 'age' is recorded in whole years - confirm.
ggplot(data, aes(x = age)) +
  geom_histogram(binwidth = 1, fill = "lightblue", color = "black") +
  labs(x = "Age", y = "Frequency", title = "Histogram of Age Distribution") +
  theme_minimal()
# Risk score by age category (boxplots)
ggplot(data = data, mapping = aes(x = agecat, y = risk)) +
  geom_boxplot() +
  labs(
    title = "Boxplot of Risk and Adventure by Age",
    x = "Age",
    y = "Risk and Adventure"
  ) +
  theme_minimal()
# Risk score by sex (boxplots)
ggplot(data = data, mapping = aes(x = as.factor(sex), y = risk)) +
  geom_boxplot()
# Descriptive summary of every column
summary(data)
# Data cleaning: drop every row that contains an NA, then summarise again
data <- na.omit(data)
summary(data)
# Risk score by wave (boxplots)
ggplot(data = data, mapping = aes(x = as.factor(wave), y = risk)) +
  geom_boxplot()
# Scatter plot with risk on the x axis and age on the y axis, plus a linear fit
library(ggplot2)
ggplot(data = data, mapping = aes(x = risk, y = age)) +
  geom_point() +
  geom_smooth(method = "lm")
# Restrict the data to three selected countries: Andorra, Romania, Spain
data1 <- subset(data, country_lab %in% c("Andorra", "Romania", "Spain"))
# Risk distribution (boxplots) for the three selected countries
ggplot(data1, aes(as.factor(country_lab), risk))+
geom_boxplot()
# Age vs risk, with points and a linear fit (with confidence band) per country.
# NOTE(review): this plot uses the full 'data' (every country), not the
# three-country subset 'data1' built above - confirm which one is intended.
ggplot(data, aes(age, risk, color = as.factor(country_lab)))+
geom_point()+
geom_smooth(method = "lm", se = TRUE)
# Read the file of hardship indicators that were collected by hand from the
# CIA factbook, WHO, and World Bank
# (see Supplemental Materials for URL sources)
countryfacts <- read.csv(
  "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/countryfacts_selection.csv",
  header = TRUE,
  as.is = TRUE
)
# Ten alternative column labels.
# NOTE(review): this vector is only defined here; nothing in this part of the
# script assigns it to colnames(countryfacts) - confirm whether it is needed.
labels <- c(
  "code", "country", "codeWVS", "Homicide", "GDP",
  "InfMort", "LifeExp", "GINI", "GenderPEdu", "code2"
)
# Show the indicator table
print(countryfacts)
# dplyr supplies count()
library(dplyr)
# Tally the rows of 'data' for each age category (column 'agecat')
age_counts <- count(data, agecat)
# Show the per-category tallies
print(age_counts)
summary(data)
sex age country wave
Length:156528 Min. : 15.00 Min. : 12.0 Length:156528
Class :character 1st Qu.: 28.00 1st Qu.:276.0 Class :character
Mode :character Median : 39.00 Median :466.0 Mode :character
Mean : 41.62 Mean :477.4
3rd Qu.: 54.00 3rd Qu.:710.0
Max. :102.00 Max. :894.0
risk.V1 country_lab risk_ord agecat
Min. :36.15574 Length:156528 Min. :1.000 Length:156528
1st Qu.:42.42763 Class :character 1st Qu.:2.000 Class :character
Median :48.69953 Mode :character Median :3.000 Mode :character
Mean :50.00000 Mean :3.207
3rd Qu.:54.97142 3rd Qu.:4.000
Max. :67.51521 Max. :6.000
# dplyr supplies group_by() and count()
library(dplyr)
# Within each country, tally how many rows fall into each age category
agepercountries_counts <- count(group_by(data, country_lab), agecat)
# Show the tallies
print(agepercountries_counts)
NA
# dplyr supplies group_by() and count(). library() stops with a clear error
# if the package is missing, instead of installing it as a side effect of
# running the analysis.
library(dplyr)
# Within each country, count the number of rows for each sex
sexpercountries_counts <- data %>%
  group_by(country_lab) %>%
  count(sex)
# Print the result
print(sexpercountries_counts)
NA
# dplyr supplies group_by() and summarize(). library() stops with a clear
# error if the package is missing, instead of installing it as a side effect
# of running the analysis.
library(dplyr)
# Per country: mean, median, minimum and maximum age (missing ages ignored)
age_distribution_per_country <- data %>%
  group_by(country_lab) %>%
  summarize(
    mean_age = mean(age, na.rm = TRUE),
    median_age = median(age, na.rm = TRUE),
    min_age = min(age, na.rm = TRUE),
    max_age = max(age, na.rm = TRUE)
  )
# Print the result
print(age_distribution_per_country)
NA
# dplyr supplies group_by() and summarize(). library() stops with a clear
# error if the package is missing, instead of installing it as a side effect
# of running the analysis.
library(dplyr)
# Mean of the reverse-coded risk item (risk_ord) for each sex
risk_by_sex <- data %>%
  group_by(sex) %>%
  summarize(mean_risk = mean(risk_ord, na.rm = TRUE))
# Difference in mean risk between the sexes. diff() subtracts each row's mean
# from the next row's, so the sign depends on the row order of risk_by_sex
# (group_by() orders groups by the sorted values of 'sex').
sex_difference <- diff(risk_by_sex$mean_risk)
# Print the result
print(risk_by_sex)
print(sex_difference)
[1] 0.3975174
# dplyr supplies group_by() and summarize(). library() stops with a clear
# error if the package is missing, instead of installing it as a side effect
# of running the analysis.
library(dplyr)
# Mean of the reverse-coded risk item (risk_ord) for each country
risk_by_country_lab <- data %>%
  group_by(country_lab) %>%
  summarize(mean_risk = mean(risk_ord, na.rm = TRUE))
# Print the result
print(risk_by_country_lab)
# Mean of the reverse-coded risk item (risk_ord) for each age category
risk_by_agecat <- data %>%
  group_by(agecat) %>%
  summarize(mean_risk = mean(risk_ord, na.rm = TRUE))
# Print the result
print(risk_by_agecat)
NA
data
# dplyr supplies mutate(). library() stops with a clear error if the package
# is missing, instead of installing it as a side effect of the analysis.
library(dplyr)
# Convert an indicator to z-scores (mean 0, SD 1) as a plain numeric vector;
# scale() on its own would return a one-column matrix.
standardize_indicator <- function(x) {
  as.numeric(scale(x))
}
# Create the 'hardship' column in the 'countryfacts' data frame as the mean of
# the six standardised indicators. Averaging the raw columns would let the
# indicator with the largest numeric range dominate the index, and would add
# indicators that point in opposite directions.
# NOTE(review): assumes the CSV holds raw (not already standardised or
# reversed) values, and that higher gdp, lifeexpectancy and
# femalemale_primedu mean LESS hardship, so these three enter with a negative
# sign - confirm against the indicator definitions.
# A country with a missing value in any indicator gets hardship = NA.
countryfacts <- countryfacts %>%
  mutate(
    hardship = (
      standardize_indicator(homiciderate) -
        standardize_indicator(gdp) +
        standardize_indicator(infantmortality) -
        standardize_indicator(lifeexpectancy) +
        standardize_indicator(gini) -
        standardize_indicator(femalemale_primedu)
    ) / 6
  )
countryfacts
# Summarise the 'hardship' column separately for each country label.
# NOTE(review): if 'countryfacts' has a single row per label, each group holds
# one value, so sd is NA and mean/median/min/max all coincide - confirm that a
# per-label summary is what is wanted.
hardship_index_distribution <- countryfacts %>%
  group_by(label) %>%
  summarize(
    mean = mean(hardship, na.rm = TRUE),
    median = median(hardship, na.rm = TRUE),
    sd = sd(hardship, na.rm = TRUE),
    # min()/max() on a group with no non-missing value warn and return
    # Inf/-Inf; report NA for such groups instead.
    min = if (all(is.na(hardship))) NA_real_ else min(hardship, na.rm = TRUE),
    max = if (all(is.na(hardship))) NA_real_ else max(hardship, na.rm = TRUE),
    n = sum(!is.na(hardship))
  )
Warning: There were 24 warnings in `summarize()`.
The first warning was:
ℹ In argument: `min = min(hardship, na.rm = TRUE)`.
ℹ In group 2: `label = "Andorra"`.
Caused by warning in `min()`:
! no non-missing arguments to min; returning Inf
ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 23 remaining warnings.
# Print the result
print(hardship_index_distribution)
NA
NA
NA
NA
NA
NA
NA
NA
NA
NA
NA
NA
NA
countryfacts